library(ggplot2)
library(Seurat)
library(STutility)
library(Matrix)
library(Rcpp)
library(harmony)
library(patchwork)
library(sctransform)
library(png)
library(dplyr)
library(gprofiler2)
library(scCATCH)
library(tidyr)
library(VennDiagram)
# library(cellassign) library(SingleCellExperiment)
# library(tensorflow) library(scater)
This analysis pipeline works under the assumption that the sample files (Control and Tumor) are stored in each individual folder, one for the control sample, one for the tumor. It will create a list containing all the paths needed for downstream analysis using Stutility and Seurat. A global path needs to be provided (which contains the two sample folders), as well as the id’s for the control and tumor samples.
global.path <- "/mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33" # Replace with data path
setwd(global.path) # this sets the file path as the working directory
idControl <- "S33-C-2" # Name of the control sample
idTumor <- "S33-T-2" # Name of the tumor sample
In this table we create a data.frame that will contain the to the paths we need as input for our analysis (samples, spot files, images, .json), so we can later make it iterable and run our analysis inside the R markdown file.
samplePaths <- list.files(global.path, full.names = T)
infoTable <- data.frame()
for (sample in 1:length(samplePaths)) {
newDf <- data.frame(samples = list.files(samplePaths[sample],
full.names = T, pattern = ".h5"), spotfiles = list.files(samplePaths[sample],
full.names = T, pattern = ".csv"), imgs = list.files(samplePaths[sample],
full.names = T, pattern = ".png"), json = list.files(samplePaths[sample],
full.names = T, pattern = ".json"))
infoTable <- rbind(infoTable, newDf)
}
se <- InputFromTable(infotable = infoTable, platform = "Visium")
## Using spotfiles to remove spots outside of tissue
## Loading /mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33/S33-C-2/filtered_feature_bc_matrix.h5 count matrix from a 'Visium' experiment
## Loading /mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33/S33-T-2/filtered_feature_bc_matrix.h5 count matrix from a 'Visium' experiment
##
## ------------- Filtering (not including images based filtering) --------------
## Spots removed: 0
## Genes removed: 16692
## Saving capture area ranges to Staffli object
## After filtering the dimensions of the experiment is: [19909 genes, 1850 spots]
se$sample_id <- paste0("sample_", GetStaffli(se)@meta.data$sample)
se$orig.ident <- "S33"
st.object <- GetStaffli(se)
st.object
## An object of class Staffli
## 1850 spots across 2 samples.
For an initial view of the distribution of the features along the spots, we use the “ST.FeaturePlot” command.
ST.FeaturePlot(se, features = c("nFeature_RNA"), palette = "Spectral",
ncol = 2, pt.size = 2.5)
ST.FeaturePlot(se, features = c("nCount_RNA"), palette = "Spectral",
ncol = 2, pt.size = 2.5)
# se_subsetControl <- InputFromTable(infoTable[1,])
se_subsetControl <- SubsetSTData(se, expression = sample_id %in%
"sample_1")
se_subsetControl <- SetIdent(se_subsetControl, value = idControl)
se_subsetControl$sample_id <- idControl
se_subsetControl
## An object of class Seurat
## 19909 features across 1290 samples within 1 assay
## Active assay: RNA (19909 features, 0 variable features)
# se_subsetTumor <- InputFromTable(infoTable[2,])
se_subsetTumor <- SubsetSTData(se, expression = sample_id %in%
"sample_2")
se_subsetTumor <- SetIdent(se_subsetTumor, value = idTumor)
se_subsetTumor$sample_id <- idTumor
se_subsetTumor
## An object of class Seurat
## 19909 features across 560 samples within 1 assay
## Active assay: RNA (19909 features, 0 variable features)
This is unused by now. this generates feature and count distribution in the form of histograms.
generateQCplots <- function(sampleSubset, title, color) {
p1 <- ggplot() + geom_histogram(data = sampleSubset[[]],
aes(nFeature_RNA), fill = color, alpha = 0.7, bins = 100) +
ggtitle("Unique genes per spot")
p2 <- ggplot() + geom_histogram(data = sampleSubset[[]],
aes(nCount_RNA), fill = color, alpha = 0.7, bins = 100) +
ggtitle("Total counts per spots")
gene_attr <- data.frame(nUMI = Matrix::rowSums(sampleSubset@assays$RNA@counts),
nSpots = Matrix::rowSums(sampleSubset@assays$RNA@counts >
0))
p3 <- ggplot() + geom_histogram(data = gene_attr, aes(nUMI),
fill = color, alpha = 0.7, bins = 100) + scale_x_log10() +
ggtitle("Total counts per gene (log10 scale)")
p4 <- ggplot() + geom_histogram(data = gene_attr, aes(nSpots),
fill = color, alpha = 0.7, bins = 100) + ggtitle("Total spots per gene")
(p1 - p2)/(p3 - p4) + plot_annotation(title = title)
}
# mitochondrial genes
mt.genesControl <- grep(pattern = "^MT-", x = rownames(se_subsetControl),
value = TRUE)
se_subsetControl$percent.mito <- (Matrix::colSums(se_subsetControl@assays$RNA@counts[mt.genesControl,
])/Matrix::colSums(se_subsetControl@assays$RNA@counts)) *
100
# VlnPlot(se_subsetControl, features = c('nCount_RNA',
# 'nFeature_RNA', 'percent.mito'), pt.size = 0.1, ncol = 3)
# + plot_annotation(paste('QC violin plot Control: ',
# idControl))
mt.genesTumor <- grep(pattern = "^MT-", x = rownames(se_subsetTumor),
value = TRUE)
se_subsetTumor$percent.mito <- (Matrix::colSums(se_subsetTumor@assays$RNA@counts[mt.genesTumor,
])/Matrix::colSums(se_subsetTumor@assays$RNA@counts)) * 100
# VlnPlot(se_subsetTumor, features = c('nCount_RNA',
# 'nFeature_RNA', 'percent.mito'), pt.size = 0.1, ncol = 3,
# cols = 'cornflowerblue') + plot_annotation(paste('QC
# violin plot Tumor: ', idTumor))
# ribosomal genes
rb.genesControl <- grep(pattern = "^RP[SL]", x = rownames(se_subsetControl),
value = TRUE)
se_subsetControl$percent.ribo <- (Matrix::colSums(se_subsetControl@assays$RNA@counts[rb.genesControl,
])/Matrix::colSums(se_subsetControl@assays$RNA@counts)) *
100
VlnPlot(se_subsetControl, features = c("nCount_RNA", "nFeature_RNA",
"percent.mito", "percent.ribo"), pt.size = 0.1, ncol = 4) +
plot_annotation(paste("QC violin plot Control: ", idControl))
rb.genesTumor <- grep(pattern = "^RP[SL]", x = rownames(se_subsetTumor),
value = TRUE)
se_subsetTumor$percent.ribo <- (Matrix::colSums(se_subsetTumor@assays$RNA@counts[rb.genesTumor,
])/Matrix::colSums(se_subsetTumor@assays$RNA@counts)) * 100
VlnPlot(se_subsetTumor, features = c("nCount_RNA", "nFeature_RNA",
"percent.mito", "percent.ribo"), pt.size = 0.1, ncol = 4,
cols = "cornflowerblue") + plot_annotation(paste("QC violin plot Tumor: ",
idTumor))
merged <- MergeSTData(se_subsetControl, se_subsetTumor)
VlnPlot(merged, features = c("nCount_RNA", "nFeature_RNA", "percent.mito",
"percent.ribo"), pt.size = 0.1, ncol = 4, group.by = "sample_id",
cols = c("brown3", "cornflowerblue")) + plot_annotation("QC violin plot")
These plots will help us with the thresholding:
FeatureScatter(se_subsetControl, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
cols = "brown3") + plot_annotation(paste("Count vs Gene Scatter Plot ",
idControl))
FeatureScatter(se_subsetTumor, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
cols = "cornflowerblue") + plot_annotation(paste("Count vs Gene Scatter Plot ",
idTumor))
#### Count vs Mitochondrial content
p1 <- FeatureScatter(se_subsetControl, feature1 = "nCount_RNA",
feature2 = "percent.mito", cols = "brown3") + plot_annotation(paste("Count vs % Mitochondrial Scatter Plot ",
idControl))
p2 <- FeatureScatter(se_subsetTumor, feature1 = "nCount_RNA",
feature2 = "percent.mito", cols = "cornflowerblue") + plot_annotation(paste("Count vs % Mitochondrial Scatter Plot ",
idTumor))
p1 - p2
FeatureScatter(se_subsetControl, feature1 = "nCount_RNA", feature2 = "percent.ribo",
cols = "brown3") + plot_annotation(paste("Count vs % Ribosomal Scatter Plot ",
idControl)) - FeatureScatter(se_subsetTumor, feature1 = "nCount_RNA",
feature2 = "percent.ribo", cols = "cornflowerblue") + plot_annotation(paste("Count vs % Ribosomal Scatter Plot ",
idTumor))
This will generate a se subset, a Seurat class object subset that will take the infoTable as input for the corresponding sample and apply filtering based on the following parameters:
- Keeping only the genes that have presence in at least 5 capture spots and a total count value of >= 100 - Keeping the only that spots that contain >= 200 transcripts
Here we are filtering based on a minimum UMI count per spot, minimum gene per spot, and also filtering genes that are not seen in more than N spots.
# Control
se_subsetControlQC <- SubsetSTData(se_subsetControl, expression = nCount_RNA >
200)
se_subsetControlQC <- SubsetSTData(se_subsetControlQC, expression = nFeature_RNA >
100)
cat("Spots removed: ", ncol(se_subsetControl) - ncol(se_subsetControlQC),
"\n")
## Spots removed: 527
# Tumor
se_subsetTumorQC <- SubsetSTData(se_subsetTumor, expression = nCount_RNA >
200)
se_subsetTumorQC <- SubsetSTData(se_subsetTumorQC, expression = nFeature_RNA >
100)
cat("Spots removed: ", ncol(se_subsetTumor) - ncol(se_subsetTumorQC),
"\n")
## Spots removed: 4
# Control
se_subsetControlQCmito <- SubsetSTData(se_subsetControlQC, expression = percent.mito <
15)
se_subsetControlQCribomito <- SubsetSTData(se_subsetControlQCmito,
expression = percent.ribo < 25)
cat("Spots removed: ", ncol(se_subsetControlQC) - ncol(se_subsetControlQCribomito),
"\n")
## Spots removed: 48
se_subsetControlQC <- se_subsetControlQCribomito
# Tumor
se_subsetTumorQCmito <- SubsetSTData(se_subsetTumorQC, expression = percent.mito <
15)
se_subsetTumorQCribomito <- SubsetSTData(se_subsetTumorQCmito,
expression = percent.ribo < 25)
cat("Spots removed: ", ncol(se_subsetTumorQC) - ncol(se_subsetTumorQCribomito),
"\n")
## Spots removed: 20
se_subsetTumorQC <- se_subsetTumorQCribomito
# Genes distributed in less than N spots
expMatControl <- GetAssayData(se_subsetControlQC, slot = "counts")
expMatTumor <- GetAssayData(se_subsetTumorQC, slot = "counts")
gene.countsControl <- Matrix::rowSums(expMatControl)
gene.countsTumor <- Matrix::rowSums(expMatTumor)
valid.genesControl <- gene.countsControl > 2
valid.genesTumor <- gene.countsTumor > 2
keep.genesControl <- rownames(se_subsetControlQC)[valid.genesControl]
keep.genesTumor <- rownames(se_subsetTumorQC)[valid.genesTumor]
se_subsetControlQC <- SubsetSTData(se_subsetControlQC, features = keep.genesControl)
se_subsetTumorQC <- SubsetSTData(se_subsetTumorQC, features = keep.genesTumor)
length(keep.genesControl)
## [1] 12635
length(keep.genesTumor)
## [1] 15029
# Filter MALAT1
keep.genesControl <- keep.genesControl[!(grepl("MALAT1", keep.genesControl))]
keep.genesTumor <- keep.genesTumor[!(grepl("MALAT1", keep.genesTumor))]
se_subsetControlQC <- SubsetSTData(se_subsetControlQC, features = keep.genesControl)
se_subsetTumorQC <- SubsetSTData(se_subsetTumorQC, features = keep.genesTumor)
length(keep.genesControl)
## [1] 12634
length(keep.genesTumor)
## [1] 15028
# Filter RPL13
keep.genesControl <- keep.genesControl[!(grepl("RPL13", keep.genesControl))]
keep.genesTumor <- keep.genesTumor[!(grepl("RPL13", keep.genesTumor))]
se_subsetControlQC <- SubsetSTData(se_subsetControlQC, features = keep.genesControl)
se_subsetTumorQC <- SubsetSTData(se_subsetTumorQC, features = keep.genesTumor)
length(keep.genesControl)
## [1] 12631
length(keep.genesTumor)
## [1] 15025
# Update with valid genes
se_subsetControlQC <- SubsetSTData(se_subsetControlQC, features = keep.genesControl)
se_subsetTumorQC <- SubsetSTData(se_subsetTumorQC, features = keep.genesTumor)
# Control
VlnPlot(se_subsetControlQC, features = c("nCount_RNA", "nFeature_RNA",
"percent.mito", "percent.ribo"), pt.size = 0.1, ncol = 4) +
plot_annotation(paste("QC violin plot Control:", idControl,
"(filtered)"))
# Tumor
VlnPlot(se_subsetTumorQC, features = c("nCount_RNA", "nFeature_RNA",
"percent.mito", "percent.ribo"), pt.size = 0.1, ncol = 4,
cols = "cornflowerblue") + plot_annotation(paste("QC violin plot Tumor:",
idTumor, "(filtered)"))
mergedQC <- MergeSTData(se_subsetControlQC, se_subsetTumorQC)
VlnPlot(mergedQC, features = c("nCount_RNA", "nFeature_RNA",
"percent.mito", "percent.ribo"), pt.size = 0.1, ncol = 4,
group.by = "sample_id", cols = c("brown3", "cornflowerblue")) +
plot_annotation("QC violin plot (filtered)")
ST.FeaturePlot(se_subsetControlQC, features = c("nCount_RNA"),
palette = "Spectral", ncol = 1, pt.size = 2.5) + plot_annotation(paste("Count spatial plot: ",
idControl, "(filtered)"))
ST.FeaturePlot(se_subsetTumorQC, features = c("nCount_RNA"),
palette = "Spectral", ncol = 1, pt.size = 2.5) + plot_annotation(paste("Count spatial plot: ",
idTumor, "(filtered)"))
For simplicity, we are raplacing the name of our QC’d seurat object subsets to something more simple:
seControl <- se_subsetControlQC
seTumor <- se_subsetTumorQC
# Control
widthControl <- dim(readPNG(infoTable$imgs[1]))[2]
seControl <- LoadImages(seControl, time.resolve = TRUE, verbose = TRUE,
xdim = widthControl)
## Loading images for 1 samples:
## Reading /mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33/S33-C-2/tissue_hires_image.png for sample 1 ...
## Scaling down sample 1 image from 1957x2000 pixels to 1957x2000 pixels
controlPlot <- FeatureOverlay(seControl, features = "nCount_RNA",
pt.size = 2.5, palette = "Spectral", type = "raw", pt.alpha = 0.6)
se_subsetControl <- LoadImages(se_subsetControl, time.resolve = TRUE,
verbose = TRUE, xdim = widthControl)
## Loading images for 1 samples:
## Reading /mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33/S33-C-2/tissue_hires_image.png for sample 1 ...
## Scaling down sample 1 image from 1957x2000 pixels to 1957x2000 pixels
controlPlotUnfiltered <- FeatureOverlay(se_subsetControl, features = "nCount_RNA",
pt.size = 2.5, palette = "Spectral", type = "raw", pt.alpha = 0.6)
# Tumor
widthTumor <- dim(readPNG(infoTable[2, ]$imgs))[1]
seTumor <- LoadImages(seTumor, time.resolve = TRUE, verbose = TRUE,
xdim = widthTumor)
## Loading images for 1 samples:
## Reading /mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33/S33-T-2/tissue_hires_image.png for sample 1 ...
## Scaling down sample 1 image from 2000x1916 pixels to 1916x1836 pixels
tumorPlot <- FeatureOverlay(seTumor, features = "nCount_RNA",
pt.size = 2.5, palette = "Spectral", type = "raw", pt.alpha = 0.6)
se_subsetTumor <- LoadImages(se_subsetTumor, time.resolve = TRUE,
verbose = TRUE, xdim = widthTumor)
## Loading images for 1 samples:
## Reading /mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33/S33-T-2/tissue_hires_image.png for sample 1 ...
## Scaling down sample 1 image from 2000x1916 pixels to 1916x1836 pixels
tumorPlotUnfiltered <- FeatureOverlay(se_subsetTumor, features = "nCount_RNA",
pt.size = 2.5, palette = "Spectral", pt.alpha = 0.6, type = "raw")
(controlPlotUnfiltered + ggtitle("Overlay plot S33-C-2 (unfiltered)"))
(controlPlot + ggtitle("Overlay plot S33-C-2 (filtered)"))
(tumorPlotUnfiltered + ggtitle("Overlay plot S33-T-2 (unfiltered)"))
(tumorPlot + ggtitle("Overlay plot S33-T-2 (filtered)"))
Data normalization is made using the SCTransform function included in Seurat. The one being used is Variance Stabilized Transformation (Hafemeister & Satija, 2019).
# Control
seControl <- SCTransform(seControl, return.only.var.genes = FALSE)
seControl
sprintf("the default assay in use is: %s", DefaultAssay(seControl))
# Tumor
seTumor <- SCTransform(seTumor, return.only.var.genes = FALSE)
seTumor
sprintf("the default assay in use is: %s", DefaultAssay(seTumor))
We can later use this function in the clusters to extract the top genes in each cluster
get_top20Genes <- function(object, title) {
C = object@assays$SCT@counts # WE ARE USING THE NORMALIZED COUNTS
C@x = C@x/rep.int(colSums(C), diff(C@p))
most_expressed <- order(Matrix::rowSums(C), decreasing = T)[20:1]
boxplot(as.matrix(t(C[most_expressed, ])), cex = 0.1, las = 1,
xlab = "% total count per cell", col = (scales::hue_pal())(20)[20:1],
horizontal = TRUE)
title(title)
}
pC <- get_top20Genes(seControl, paste("Top 20 expressed genes: ",
idControl))
pC
## NULL
pT <- get_top20Genes(seTumor, paste("Top 20 expressed genes: ",
idTumor))
pT
## NULL
# Merge both se objects
mergedPCA <- MergeSTData(seControl, seTumor)
mergedPCA <- SCTransform(mergedPCA, return.only.var.genes = FALSE)
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 14256 by 1251
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 1251 cells
## Found 61 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 14256 genes
## Computing corrected count matrix for 14256 genes
## Calculating gene attributes
## Wall clock passed: Time difference of 13.83704 secs
## Determine variable features
## Place corrected count matrix in counts slot
## Centering data matrix
## Set default assay to SCT
mergedPCA <- RunPCA(mergedPCA, assay = "SCT")
## PC_ 1
## Positive: PIP, SCGB2A2, DCN, S100A6, KRT15, ITM2B, KRT17, KRT14, TAGLN, RPL34
## TXNIP, MYL9, CST3, GSN, STC2, CFD, ACTA2, FTH1, IGKC, RPL17
## A2M, MT1X, VIM, RPS3A, IGHA1, MIR205HG, SPARCL1, PTN, SAA1, RPS27A
## Negative: CD24, CHCHD2, PKM, CBX3, PPIA, CYCS, COL1A1, MT-ATP6, PEG10, PSAP
## FN1, HSP90AA1, RPS24, MT-ND2, AZGP1, COL1A2, MT-CO3, VOPP1, MT-ND4, MT-ND3
## SCD, COL3A1, RPS29, POSTN, SPARC, CRIP2, CTSD, MT-ND1, APOD, COMP
## PC_ 2
## Positive: PIP, AZGP1, MGP, KRT15, SCGB2A2, STC2, RPL17, RPS4X, PTN, TFF3
## TACSTD2, MIR205HG, AGR2, SERPINA3, SERPINA1, MUCL1, PI15, ITM2B, KRT14, KRT17
## RPL34, EVL, ELOVL5, MT1X, SEC14L2, CHCHD2, WFDC2, RPL5, SCGB3A1, RPS27A
## Negative: CCDC80, COL1A2, MT-ND3, COL1A1, CFD, MT-ND4, C3, BGN, FSTL1, TNXB
## CD74, IGFBP7, RNASE1, DCN, COL6A1, TMSB4X, COL6A3, COL6A2, B2M, HLA-E
## FTL, NEAT1, C1QA, VIM, SPARC, GPX3, MMP2, CALD1, HLA-B, SAMHD1
## PC_ 3
## Positive: CHCHD2, DCN, RPS24, PEG10, IGHA1, IGKC, GSN, IGFBP6, CFD, VOPP1
## CXCL12, C3, FBLN1, TNXB, CD24, JCHAIN, PSAP, CLDN5, PLAC9, GPX3
## C1R, S100A6, C1S, EGFL7, SERPING1, VIM, ADH1B, CBX3, DEPP1, IFITM2
## Negative: TCN1, MUC1, CALML5, CTSD, SERPINA3, LTF, S100A9, IGF1R, CRIP1, CLEC3A
## S100A7, GDF9, LRG1, SLC30A8, APOD, LRRC6, SHROOM1, RPS29, AGR3, SKIL
## COX6C, LAPTM4B, CRIP2, PYGL, MT-ND4, MTRNR2L12, ESR1, SERPINA1, CD59, EVL
## PC_ 4
## Positive: DCN, APOD, CRIP1, S100A9, CFD, IGF1R, IGHA1, LTF, CLEC3A, TCN1
## FBLN1, IGFBP6, FTL, GDF9, CRIP2, CALML5, C3, MUC1, RPS29, LAPTM4B
## SLC30A8, S100A7, S100A6, LRG1, AEBP1, LRRC6, C1S, VIM, C1R, SHROOM1
## Negative: CCDC80, COL1A2, COL1A1, NEAT1, SCGB2A2, BGN, CALD1, FSTL1, PIP, MT-ND3
## SAMHD1, PDK4, NAP1L1, ACTA2, CD24, KRT15, STC2, MAP1B, POSTN, PTMS
## COL6A3, EGR1, ANKRD12, PRRC2C, RBM25, KLF6, THY1, AL627171.2, HSP90AA1, NFIX
## PC_ 5
## Positive: FABP4, SAA1, FHL1, MT-ND4, CFD, PLIN4, PLIN1, CD36, MT-ND3, AL627171.2
## CCDC80, ADIRF, CAV1, ADH1B, GPX3, AOC3, LPL, CAVIN2, LIPE, GPD1
## TNS1, MGLL, PNPLA2, MT-ND1, MT-CO3, RBP4, NEAT1, SPTBN1, AQP1, PDK4
## Negative: COL1A1, FN1, COL1A2, COL3A1, POSTN, HLA-DRA, TAGLN, APOE, KRT17, KRT14
## SFRP2, HLA-DRB1, LUM, CCN2, ACTA2, AEBP1, COL6A1, CD74, COL6A2, HLA-DPA1
## MYL9, BGN, TPM2, SPARC, LYZ, KRT5, ACTG2, CNN1, THBS1, COL12A1
head(Loadings(mergedPCA, reduction = "pca")[, 1:5])
plotPCA <- DimPlot(mergedPCA, reduction = "pca", group.by = "sample_id",
cols = c("brown3", "cornflowerblue")) + ggtitle("PCA plot (Control vs Tumor)")
plotVln <- VlnPlot(mergedPCA, features = "PC_1", group.by = "sample_id",
cols = c("brown3", "cornflowerblue"))
plotPCA - plotVln
ElbowPlot(mergedPCA) + ggtitle("Elbow Plot")
mergedPCA <- RunHarmony(mergedPCA, group.by.vars = "sample_id",
assay.use = "SCT", reduction = "pca", plot_convergence = TRUE)
## Harmony 1/10
## Harmony 2/10
## Harmony converged after 2 iterations
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.SCT.harmony; see ?make.names for more details
## on syntax validity
plotPCA <- DimPlot(mergedPCA, reduction = "harmony", group.by = "sample_id",
cols = c("brown3", "cornflowerblue")) + ggtitle("PCA plot (Control vs Tumor)")
plotVln <- VlnPlot(mergedPCA, features = "harmony_1", group.by = "sample_id",
cols = c("brown3", "cornflowerblue"))
plotPCA - plotVln
pam50 <- c("UBE2T", "BIRC5", "NUF2", "CDC6", "CCNB1", "TYMS",
"MYBL2", "CEP55", "MELK", "NDC80", "RRM2", "UBE2C", "CENPF",
"PTTG1", "EXO1", "ORC6L", "ANLN", "CCNE1", "CDC20", "MKI67",
"KIF2C", "ACTR3B", "MYC", "EGFR", "KRT5", "PHGDH", "CDH3",
"MIA", "KRT17", "FOXC1", "SFRP1", "KRT14", "ESR1", "SLC39A6",
"BAG1", "MAPT", "PGR", "CXXC5", "MLPH", "BCL2", "MDM2", "NAT1",
"FOXA1", "BLVRA", "MMP11", "GPR160", "FGFR4", "GRB7", "TMEM45B",
"ERBB2")
for (gene in pam50) {
plot <- tryCatch(ST.FeaturePlot(mergedPCA, features = gene,
pt.size = 2.5, palette = "Spectral", ncol = 2), error = function(e) NA)
print(plot)
}
## [1] NA
(https://doi.org/10.1186/s13058-019-1242-9)
# NORAD (LINC00657); COL1A2; SCD
ST.FeaturePlot(mergedPCA, features = c("NORAD"), pt.size = 2.5,
palette = "Spectral", ncol = 2)
ST.FeaturePlot(mergedPCA, features = c("COL1A2"), pt.size = 2.5,
palette = "Spectral", ncol = 2)
ST.FeaturePlot(mergedPCA, features = c("SCD"), pt.size = 2.5,
palette = "Spectral", ncol = 2)
VlnPlot(mergedPCA, features = pam50, same.y.lims = TRUE, group.by = "sample_id",
ncol = 10)
## Warning in FetchData(object = object, vars = features, slot = slot): The
## following requested variables were not found: ORC6L
VlnPlot(mergedPCA, features = c("NORAD", "COL1A2", "SCD"), same.y.lims = TRUE,
group.by = "sample_id", ncol = 3)
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing missing values (geom_point).
mergedPCA <- FindNeighbors(mergedPCA, reduction = "harmony",
dims = 1:8) ## BASED ON ELBOW PLOT
## Computing nearest neighbor graph
## Computing SNN
mergedPCA <- FindClusters(mergedPCA, verbose = FALSE, resolution = 0.2) ## CLUSTER FORMATION RESOLUTION
mergedPCA <- RunUMAP(mergedPCA, reduction = "harmony", dims = 1:8) ## BASED ON ELBOW PLOT
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 17:32:08 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:32:08 Read 1251 rows and found 8 numeric columns
## 17:32:08 Using Annoy for neighbor search, n_neighbors = 30
## 17:32:08 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:32:08 Writing NN index file to temp file /tmp/Rtmpi7ML7W/file716e70f7d233
## 17:32:08 Searching Annoy index using 1 thread, search_k = 3000
## 17:32:09 Annoy recall = 100%
## 17:32:09 Commencing smooth kNN distance calibration using 1 thread
## 17:32:10 Initializing from normalized Laplacian + noise
## 17:32:10 Commencing optimization for 500 epochs, with 49376 positive edges
## 17:32:12 Optimization finished
plt.UMAP.merged <- DimPlot(mergedPCA, reduction = "umap", group.by = "sample_id") +
ggtitle(paste("UMAP plot Merged(", idControl, "and", idTumor,
")"))
plt.UMAP.merged
plt.UMAP.merged <- DimPlot(mergedPCA, reduction = "umap", split.by = "sample_id") +
ggtitle(paste("UMAP plot Merged(", idControl, "and", idTumor,
")"))
plt.UMAP.merged
plt.spatialUMAP.merged <- ST.FeaturePlot(mergedPCA, features = "seurat_clusters",
pt.size = 2.6, palette = "Spectral", ncol = 2) + ggtitle(paste("Spatial UMAP plot Merged(",
idControl, "and", idTumor, ")"))
plt.spatialUMAP.merged
# Control
seControl <- SubsetSTData(mergedPCA, expression = sample_id %in%
idControl)
# Tumor
seTumor <- SubsetSTData(mergedPCA, expression = sample_id %in%
idTumor)
# Control
ST.FeaturePlot(seControl, features = "seurat_clusters", pt.size = 2.6,
split.labels = T, ncol = 3) + ggtitle(paste("Spatial UMAP plot ",
idControl))
# Tumor
ST.FeaturePlot(seTumor, features = "seurat_clusters", pt.size = 2.6,
split.labels = T, ncol = 3) + ggtitle(paste("Spatial UMAP plot ",
idTumor))
# Control
widthControl <- dim(readPNG(infoTable$imgs[1]))[2]
seControl <- LoadImages(seControl, time.resolve = TRUE, verbose = TRUE,
xdim = widthControl)
## Loading images for 1 samples:
## Reading /mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33/S33-C-2/tissue_hires_image.png for sample 1 ...
## Scaling down sample 1 image from 1957x2000 pixels to 1957x2000 pixels
FeatureOverlay(seControl, features = "seurat_clusters", pt.size = 2.5,
palette = "Spectral", pt.alpha = 0.4, sample.label = FALSE) +
ggtitle(paste("Spatial overlay UMAP plot ", idControl))
# Tumor
widthTumor <- dim(readPNG(infoTable[2, ]$imgs))[1]
seTumor <- LoadImages(seTumor, time.resolve = TRUE, verbose = TRUE,
xdim = widthTumor)
## Loading images for 1 samples:
## Reading /mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33/S33-T-2/tissue_hires_image.png for sample 1 ...
## Scaling down sample 1 image from 2000x1916 pixels to 1916x1836 pixels
FeatureOverlay(seTumor, features = "seurat_clusters", pt.size = 2.5,
palette = "Spectral", pt.alpha = 0.4, sample.label = FALSE) +
ggtitle(paste("Spatial overlay UMAP plot ", idTumor))
Using FindMarkers function (Seurat). Default test = Wilcoxon Rank Sum test. Using a p-value threshold of 0.01
markersAll <- FindAllMarkers(mergedPCA, assay = "SCT", return.thresh = 0.01)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
markersAll %>%
group_by(cluster) %>%
top_n(n = 4, wt = avg_log2FC)
Here we are going to plot a heatmap of the 20 top expressed genes per cluster
top20 <- markersAll %>%
group_by(cluster) %>%
top_n(n = 20, wt = avg_log2FC)
DoHeatmap(mergedPCA, features = top20$gene, group.bar.height = 0.005,
slot = "scale.data") + plot_annotation("Heatmap of differentially expressed genes between clusters")
generateplotsDE <- function(cluster, ngenes, plot) {
cluster.markers <- FindMarkers(mergedPCA, ident.1 = cluster,
thresh.use = 0.01, only.pos = T) # here we extract markers for cluster N but only the up-regulated ones
# Most DE on cluster N
cluster.top <- rownames(cluster.markers)[1:ngenes]
# plot choice (because rmarkdown doesnt let us plot
# everything with 1 single function run)
if (plot == "violin") {
VlnPlot(mergedPCA, features = c(cluster.top), same.y.lims = TRUE,
ncol = 5) + theme(legend.position = "right")
} else if (plot == "umap") {
FeaturePlot(mergedPCA, features = cluster.top, ncol = 5)
} else if (plot == "spatial") {
ST.FeaturePlot(mergedPCA, features = cluster.top, palette = "Spectral",
pt.size = 2.5)
}
}
generateplotsDE(0, 10, "violin")
generateplotsDE(0, 10, "umap")
generateplotsDE(0, 5, "spatial")
generateplotsDE(1, 10, "violin")
generateplotsDE(1, 10, "umap")
generateplotsDE(1, 5, "spatial")
generateplotsDE(2, 10, "violin")
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing missing values (geom_point).
generateplotsDE(2, 10, "umap")
generateplotsDE(2, 5, "spatial")
generateplotsDE(3, 10, "violin")
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing missing values (geom_point).
generateplotsDE(3, 10, "umap")
generateplotsDE(3, 5, "spatial")
generateplotsDE(4, 10, "violin")
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing missing values (geom_point).
generateplotsDE(4, 10, "umap")
generateplotsDE(4, 5, "spatial")
# Create marker subset for each cluster
markers0 <- subset(markersAll, cluster == "0") %>%
filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>%
pull(gene)
markers1 <- subset(markersAll, cluster == "1") %>%
filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>%
pull(gene)
markers2 <- subset(markersAll, cluster == "2") %>%
filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>%
pull(gene)
markers3 <- subset(markersAll, cluster == "3") %>%
filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>%
pull(gene)
markers4 <- subset(markersAll, cluster == "4") %>%
filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>%
pull(gene)
# markers5 <- subset(markersAll, cluster == '5') %>%
# filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>% pull(gene)
# markers6 <- subset(markersAll, cluster == '6') %>%
# filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>% pull(gene)
# markers7 <- subset(markersAll, cluster == '7') %>%
# filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>% pull(gene)
# markers8 <- subset(markersAll, cluster == '8') %>%
# filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>% pull(gene)
# markers9 <- subset(markersAll, cluster == '9') %>%
# filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>% pull(gene)
# markers10 <- subset(markersAll, cluster == '10') %>%
# filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>% pull(gene)
Here we run FEA using gprofiler2, and the source we use is Gene Ontology: Biological Process (GO:BP)
go0 <- gost(query = markers0, organism = "hsapiens", significant = TRUE,
sources = c("GO:MF", "GO:BP", "KEGG", "REAC", "HP", "HPA",
"WP"))
go1 <- gost(query = markers1, organism = "hsapiens", significant = TRUE,
sources = c("GO:MF", "GO:BP", "KEGG", "REAC", "HP", "HPA",
"WP"))
go2 <- gost(query = markers2, organism = "hsapiens", significant = TRUE,
sources = c("GO:MF", "GO:BP", "KEGG", "REAC", "HP", "HPA",
"WP"))
go3 <- gost(query = markers3, organism = "hsapiens", significant = TRUE,
sources = c("GO:MF", "GO:BP", "KEGG", "REAC", "HP", "HPA",
"WP"))
go4 <- gost(query = markers4, organism = "hsapiens", significant = TRUE,
sources = c("GO:MF", "GO:BP", "KEGG", "REAC", "HP", "HPA",
"WP"))
# go5 <- gost(query = markers5, organism = 'hsapiens',
# significant = TRUE, sources = c('GO', 'KEGG', 'REAC',
# 'TF', 'MIRNA', 'CORUM', 'HP', 'HPA', 'WP')) go6 <-
# gost(query = markers6, organism = 'hsapiens', significant
# = TRUE, sources = c('GO', 'KEGG', 'REAC', 'TF', 'MIRNA',
# 'CORUM', 'HP', 'HPA', 'WP')) go7 <- gost(query =
# markers7, organism = 'hsapiens', significant = TRUE,
# sources = 'GO:BP') go8 <- gost(query = markers8, organism
# = 'hsapiens', significant = TRUE, sources = 'GO:BP') go9
# <- gost(query = markers9, organism = 'hsapiens',
# significant = TRUE, sources = 'GO:BP') go10 <- gost(query
# = markers10, organism = 'hsapiens', significant = TRUE,
# sources = 'GO:BP')
go0$result
go1$result
go2$result
go3$result
go4$result
# go5$result go6$result go7$result go8$result go9$result
# go10$result
VlnPlot(mergedPCA, features = pam50, same.y.lims = TRUE, ncol = 10)
## Warning in FetchData(object = object, vars = features, slot = slot): The
## following requested variables were not found: ORC6L
VlnPlot(mergedPCA, features = c("NORAD", "COL1A2", "SCD"), same.y.lims = TRUE,
ncol = 3)
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing missing values (geom_point).
Subclustering healthy areas that span both samples to see how differently they cluster.
# dimensional reduction and batch correction
healthy <- SubsetSTData(mergedPCA, idents = c(0))
healthy <- RunPCA(healthy, assay = "SCT")
## PC_ 1
## Positive: DCN, CFD, C3, IGFBP6, S100A6, VIM, TNXB, GSN, EGFL7, IGFBP7
## CLDN5, CXCL12, SERPING1, FBLN1, TXNIP, GPX3, SPARCL1, C1S, PLAC9, C1R
## DEPP1, IGHA1, A2M, IFITM2, IFITM3, MYL9, ADIRF, AEBP1, CST3, IGKC
## Negative: CD24, PKM, AZGP1, CHCHD2, CBX3, COL1A1, MT-ATP6, PPIA, CYCS, HSP90AA1
## PEG10, MT-ND4, MT-CO3, COL1A2, MGP, MT-ND2, PSAP, MT-ND3, RPS24, CTSD
## VOPP1, POSTN, MTRNR2L12, MUC1, FN1, NEAT1, SCD, RPS29, SCN5A, PRRC2C
## PC_ 2
## Positive: PIP, RPL17, DCN, RPS27A, RPL34, MGP, KRT15, RPS8, IGHA1, RPL11
## AZGP1, RPL5, KRT17, RPS4X, ITM2B, MUCL1, KRT14, RPS3A, TFF3, RPL29
## S100A6, FTH1, CST3, RPS17, RPL30, RPSA, SERPINA3, RPL28, TACSTD2, STC2
## Negative: CCDC80, NEAT1, COL1A2, BGN, MT-ND3, FSTL1, SAMHD1, PDK4, COL1A1, MT-ND4
## CALD1, COL6A3, NAP1L1, EGR1, KLF6, PTMS, NFIX, MAP1B, CTSZ, EIF5B
## AL627171.2, PRRC2C, ANKRD12, FOS, RBM25, HSP90AA1, FKBP1A, TMSB4X, MTRNR2L12, THY1
## PC_ 3
## Positive: DCN, IGKC, C3, CCDC80, IGFBP6, FBLN1, SFRP4, IGHA1, TNXB, SFRP2
## C1S, C1R, MMP2, PLTP, CXCL12, SELENOP, AEBP1, LUM, JCHAIN, IGFBP4
## RARRES2, GPNMB, CFD, CTSK, SERPING1, CCN5, GSN, SFRP1, OGN, AKAP12
## Negative: AQP1, IGFBP7, CLDN5, PECAM1, SPARCL1, ACKR1, HLA-E, A2M, ACTA2, EGFL7
## MYL9, GNG11, IFITM2, EPAS1, IGFBP3, B2M, HSPG2, FLNA, VWF, CD93
## CD24, ADIRF, SNCG, ARAP3, FABP4, MMRN2, APOLD1, TAGLN, ADGRL4, IFITM1
## PC_ 4
## Positive: IGKC, IGHA1, JCHAIN, ACKR1, VWF, IGLC2, IGHG1, HLA-DRA, AQP1, C1S
## IGLC1, CD74, GNG11, IGHG3, FABP4, IGLC3, PECAM1, SPRY1, SERPING1, CEBPD
## IGHG4, B2M, MBNL1, TGFBR2, MED13L, HLA-B, IL1R1, ZNF385D, CALD1, EGFL7
## Negative: TNXB, MYL9, COL6A2, COL1A2, TAGLN, IGFBP6, COL1A1, ELN, A2M, MYH11
## TIMP2, FN1, C3, CPE, SERPINF1, CLDN5, AEBP1, MRC2, OGN, SFRP1
## SOD3, DPYSL2, DCN, MFAP4, CLEC3B, NEXN, DKK3, COL3A1, S100A4, CXCL12
## PC_ 5
## Positive: RNASE1, CFD, AQP1, ACKR1, CCL14, PLVAP, C1QA, TXNIP, PDK4, CD74
## SELENOP, C1QB, IFITM1, CTSC, CST3, TNXB, CLEC3B, S100A10, TGFBR2, TNFAIP2
## KLF4, C1QC, VWF, ROBO4, PLTP, FTL, CCN5, IFITM2, DST, REX1BD
## Negative: IGKC, JCHAIN, IGHA1, CLDN5, MYL9, ACTA2, FLNA, MYH11, TAGLN, IGFBP3
## BGN, DEPP1, MUSTN1, GPX3, MYLK, TIMP1, IGFBP6, COL18A1, LRP1, COL14A1
## CALD1, LIMS2, AEBP1, ADH1B, CD151, COL4A1, MYO1C, GAS6, SEMA3G, ID1
ElbowPlot(healthy) + ggtitle("Elbow Plot")
healthy <- RunHarmony(healthy, group.by.vars = "sample_id", assay.use = "SCT",
reduction = "pca", plot_convergence = TRUE)
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony converged after 3 iterations
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.SCT.harmony; see ?make.names for more details
## on syntax validity
# clustering
healthy <- FindNeighbors(healthy, reduction = "harmony", dims = 1:6) ## BASED ON ELBOW PLOT
## Computing nearest neighbor graph
## Computing SNN
healthy <- FindClusters(healthy, verbose = FALSE, resolution = 0.2) ## CLUSTER FORMATION RESOLUTION
healthy <- RunUMAP(healthy, reduction = "harmony", dims = 1:6) ## BASED ON ELBOW PLOT
## 17:33:37 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:33:37 Read 577 rows and found 6 numeric columns
## 17:33:37 Using Annoy for neighbor search, n_neighbors = 30
## 17:33:37 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:33:37 Writing NN index file to temp file /tmp/Rtmpi7ML7W/file716e653e3678
## 17:33:37 Searching Annoy index using 1 thread, search_k = 3000
## 17:33:37 Annoy recall = 100%
## 17:33:38 Commencing smooth kNN distance calibration using 1 thread
## 17:33:38 Initializing from normalized Laplacian + noise
## 17:33:39 Commencing optimization for 500 epochs, with 21156 positive edges
## 17:33:40 Optimization finished
DimPlot(healthy, reduction = "umap") + ggtitle("Subclustering of healthy clusters")
ST.FeaturePlot(healthy, features = "seurat_clusters", pt.size = 2.6,
ncol = 2) + ggtitle("Mapping of healthy subclusters")
meta.data <- healthy[[]]
counts <- group_by(meta.data, sample_id, seurat_clusters) %>%
summarise(count = n())
## `summarise()` has grouped output by 'sample_id'. You can override using the `.groups` argument.
ggplot(counts, aes(sample_id, count, fill = seurat_clusters)) +
geom_col(position = "fill") + scale_y_continuous(labels = scales::percent) +
ggtitle("Cluster distribution across samples")
healthy <- SetIdent(healthy, value = "sample_id")
markersAll_healthy <- FindAllMarkers(healthy, assay = "SCT",
return.thresh = 0.01)
## Calculating cluster S33-C-2
## Calculating cluster S33-T-2
markersAll_healthy %>%
group_by(cluster) %>%
top_n(n = 4, wt = avg_log2FC)
top20_healthy <- markersAll_healthy %>%
group_by(cluster) %>%
top_n(n = 20, wt = avg_log2FC)
DoHeatmap(healthy, features = top20_healthy$gene, group.bar.height = 0.005,
slot = "scale.data", group.by = "sample_id") + plot_annotation("Heatmap of differentially expressed genes across healthy regions")
Cancer marker genes that are overexpressed in the healthy tissue area compared to the control sample.
top.healthy <- top20_healthy$gene[21:30]
VlnPlot(healthy, features = top.healthy, group.by = "sample_id",
assay = "SCT", ncol = 5, same.y.lims = TRUE) + plot_annotation("Differentially expressed cancer genes in healthy areas aross samples")
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing missing values (geom_point).
# VlnPlot(healthy, features = c('PKM', 'CD24', 'FN1',
# 'CBX3'), group.by = 'sample_id') +
# plot_annotation('Differentially expressed cancer genes in
# healthy areas aross samples')
seTumor <- RunPCA(seTumor, verbose = FALSE, assay = "SCT")
ElbowPlot(seTumor) + ggtitle("Elbow plot")
seTumor <- FindNeighbors(seTumor, reduction = "pca", dims = 1:7) ## BASED ON ELBOW PLOT
## Computing nearest neighbor graph
## Computing SNN
seTumor <- FindClusters(seTumor, verbose = FALSE, resolution = 0.5) ## CLUSTER FORMATION RESOLUTION
seTumor <- RunUMAP(seTumor, reduction = "pca", dims = 1:7) ## BASED ON ELBOW PLOT
## 17:33:46 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:33:46 Read 536 rows and found 7 numeric columns
## 17:33:46 Using Annoy for neighbor search, n_neighbors = 30
## 17:33:46 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:33:46 Writing NN index file to temp file /tmp/Rtmpi7ML7W/file716e22826d92
## 17:33:46 Searching Annoy index using 1 thread, search_k = 3000
## 17:33:46 Annoy recall = 100%
## 17:33:46 Commencing smooth kNN distance calibration using 1 thread
## 17:33:47 Initializing from normalized Laplacian + noise
## 17:33:47 Commencing optimization for 500 epochs, with 20506 positive edges
## 17:33:48 Optimization finished
DimPlot(seTumor, reduction = "umap") + ggtitle("UMAP plot (Control-independent)")
ST.FeaturePlot(seTumor, features = "seurat_clusters", pt.size = 2.5,
ncol = 1)
markersAll_tumor <- FindAllMarkers(seTumor, assay = "SCT", return.thresh = 0.01)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
markersAll_tumor %>%
group_by(cluster) %>%
top_n(n = 4, wt = avg_log2FC)
top20 <- markersAll_tumor %>%
group_by(cluster) %>%
top_n(n = 20, wt = avg_log2FC)
DoHeatmap(seTumor, features = top20$gene, group.bar.height = 0.005,
slot = "scale.data") + plot_annotation("Heatmap of differentially expressed genes between clusters")
In this segment, we selected and merged cancer regions and performed DEA. For the modalities of cancer + control and cancer only, the clusters will be merged into cancer regions and cancer-devoid regions based on the previous marker genes. The DE of both modalities will be compared.
# Merging cancer clusters (by renaming)
seTumor_merged <- seTumor
new.cluster.ids <- c("healthy1", "tumor1", "tumor2", "tumor3",
"tumor4", "adipose")
names(new.cluster.ids) <- levels(seTumor_merged)
seTumor_merged <- RenameIdents(seTumor, new.cluster.ids)
tumor.vs.tumor.markers <- FindMarkers(seTumor_merged, ident.1 = c("tumor1",
"tumor2", "tumor3", "tumor4"), ident.2 = "healthy1", assay = "SCT")
top20_ci <- tumor.vs.tumor.markers %>%
top_n(n = 20, wt = avg_log2FC)
DoHeatmap(seTumor_merged, features = rownames(top20_ci), group.bar.height = 0.005,
slot = "scale.data") + plot_annotation("Grouped Tumor vs Healthy area (tumor-only)")
# Merging cancer and healthy clusters (by renaming)
merged_merged <- mergedPCA
new.cluster.ids <- c("healthy1", "tumor1", "healthy2", "tumor2",
"adipose")
names(new.cluster.ids) <- levels(merged_merged)
merged_merged <- RenameIdents(merged_merged, new.cluster.ids)
levels(merged_merged) <- c("healthy1", "healthy2", "tumor1",
"tumor2", "adipose")
# tumor.vs.control.markers <- FindMarkers(merged_merged,
# ident.1 = 'tumor', ident.2 = 'healthy')
tumor.vs.control.markers <- FindMarkers(merged_merged, ident.1 = c("tumor1",
"tumor2"), ident.2 = c("healthy1", "healthy2"))
top20_cd <- tumor.vs.control.markers %>%
top_n(n = 20, wt = avg_log2FC)
DoHeatmap(merged_merged, features = rownames(top20_cd), group.bar.height = 0.005,
slot = "scale.data") + plot_annotation("Grouped Tumor vs Healthy area (control-integrated dataset)")
cd <- c(rownames(tumor.vs.control.markers))
ci <- c(rownames(tumor.vs.tumor.markers))
x <- list(cd, ci)
# venn.diagram(list(cd,ci), filename= 'vennTest',
# category.names = c('Control-dependent' ,
# 'Control-Independent'), output=TRUE, disable.logging =
# TRUE)
v <- venn.diagram(x, filename = NULL, category.names = c("With Control",
"Tumour only"), disable.logging = TRUE, fill = c("blue",
"red"))
## INFO [2022-01-12 17:33:54] [[1]]
## INFO [2022-01-12 17:33:54] x
## INFO [2022-01-12 17:33:54]
## INFO [2022-01-12 17:33:54] $filename
## INFO [2022-01-12 17:33:54] NULL
## INFO [2022-01-12 17:33:54]
## INFO [2022-01-12 17:33:54] $category.names
## INFO [2022-01-12 17:33:54] c("With Control", "Tumour only")
## INFO [2022-01-12 17:33:54]
## INFO [2022-01-12 17:33:54] $disable.logging
## INFO [2022-01-12 17:33:54] [1] TRUE
## INFO [2022-01-12 17:33:54]
## INFO [2022-01-12 17:33:54] $fill
## INFO [2022-01-12 17:33:54] c("blue", "red")
## INFO [2022-01-12 17:33:54]
# have a look at the default plot
grid.newpage()
grid.draw(v)
# have a look at the names in the plot object v
lapply(v, names)
## [[1]]
## [1] "x" "y" "id" "id.lengths" "name"
## [6] "gp" "vp" "params"
##
## [[2]]
## [1] "x" "y" "id" "id.lengths" "name"
## [6] "gp" "vp" "params"
##
## [[3]]
## [1] "x" "y" "id" "id.lengths" "name"
## [6] "gp" "vp" "params"
##
## [[4]]
## [1] "x" "y" "id" "id.lengths" "name"
## [6] "gp" "vp" "params"
##
## [[5]]
## [1] "label" "x" "y" "just"
## [5] "hjust" "vjust" "rot" "check.overlap"
## [9] "name" "gp" "vp"
##
## [[6]]
## [1] "label" "x" "y" "just"
## [5] "hjust" "vjust" "rot" "check.overlap"
## [9] "name" "gp" "vp"
##
## [[7]]
## [1] "label" "x" "y" "just"
## [5] "hjust" "vjust" "rot" "check.overlap"
## [9] "name" "gp" "vp"
##
## [[8]]
## [1] "label" "x" "y" "just"
## [5] "hjust" "vjust" "rot" "check.overlap"
## [9] "name" "gp" "vp"
##
## [[9]]
## [1] "label" "x" "y" "just"
## [5] "hjust" "vjust" "rot" "check.overlap"
## [9] "name" "gp" "vp"
# We are interested in the labels
lapply(v, function(i) i$label)
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## [1] "129"
##
## [[6]]
## [1] "102"
##
## [[7]]
## [1] "156"
##
## [[8]]
## [1] "With Control"
##
## [[9]]
## [1] "Tumour only"
# Over-write labels (5 to 7 chosen by manual check of
# labels) in foo only
v[[5]]$label <- paste(setdiff(cd, ci), collapse = "\n")
# in baa only
v[[6]]$label <- paste(setdiff(ci, cd), collapse = "\n")
# intesection
v[[7]]$label <- paste(intersect(cd, ci), collapse = "\n")
# plot grid.newpage() grid.draw(v)
cd2 <- c(rownames(top20_cd))
ci2 <- c(rownames(top20_ci))
x <- list(cd, ci)
# venn.diagram(list(cd,ci), filename= 'vennTest',
# category.names = c('Control-dependent' ,
# 'Control-Independent'), output=TRUE, disable.logging =
# TRUE)
v <- venn.diagram(x, filename = NULL, category.names = c("With Control",
"Tumour only"), disable.logging = TRUE, fill = c("blue",
"red"))
## INFO [2022-01-12 17:33:54] [[1]]
## INFO [2022-01-12 17:33:54] x
## INFO [2022-01-12 17:33:54]
## INFO [2022-01-12 17:33:54] $filename
## INFO [2022-01-12 17:33:54] NULL
## INFO [2022-01-12 17:33:54]
## INFO [2022-01-12 17:33:54] $category.names
## INFO [2022-01-12 17:33:54] c("With Control", "Tumour only")
## INFO [2022-01-12 17:33:54]
## INFO [2022-01-12 17:33:54] $disable.logging
## INFO [2022-01-12 17:33:54] [1] TRUE
## INFO [2022-01-12 17:33:54]
## INFO [2022-01-12 17:33:54] $fill
## INFO [2022-01-12 17:33:54] c("blue", "red")
## INFO [2022-01-12 17:33:54]
# have a look at the default plot
grid.newpage()
grid.draw(v)
# have a look at the names in the plot object v
lapply(v, names)
## [[1]]
## [1] "x" "y" "id" "id.lengths" "name"
## [6] "gp" "vp" "params"
##
## [[2]]
## [1] "x" "y" "id" "id.lengths" "name"
## [6] "gp" "vp" "params"
##
## [[3]]
## [1] "x" "y" "id" "id.lengths" "name"
## [6] "gp" "vp" "params"
##
## [[4]]
## [1] "x" "y" "id" "id.lengths" "name"
## [6] "gp" "vp" "params"
##
## [[5]]
## [1] "label" "x" "y" "just"
## [5] "hjust" "vjust" "rot" "check.overlap"
## [9] "name" "gp" "vp"
##
## [[6]]
## [1] "label" "x" "y" "just"
## [5] "hjust" "vjust" "rot" "check.overlap"
## [9] "name" "gp" "vp"
##
## [[7]]
## [1] "label" "x" "y" "just"
## [5] "hjust" "vjust" "rot" "check.overlap"
## [9] "name" "gp" "vp"
##
## [[8]]
## [1] "label" "x" "y" "just"
## [5] "hjust" "vjust" "rot" "check.overlap"
## [9] "name" "gp" "vp"
##
## [[9]]
## [1] "label" "x" "y" "just"
## [5] "hjust" "vjust" "rot" "check.overlap"
## [9] "name" "gp" "vp"
# We are interested in the labels
lapply(v, function(i) i$label)
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## [1] "129"
##
## [[6]]
## [1] "102"
##
## [[7]]
## [1] "156"
##
## [[8]]
## [1] "With Control"
##
## [[9]]
## [1] "Tumour only"
# Over-write labels (5 to 7 chosen by manual check of
# labels) in foo only
v[[5]]$label <- paste(setdiff(cd2, ci2), collapse = "\n")
# in baa only
v[[6]]$label <- paste(setdiff(ci2, cd2), collapse = "\n")
# intesection
v[[7]]$label <- paste(intersect(cd2, ci2), collapse = "\n")
# plot
grid.newpage()
grid.draw(v)